# Emilie Helen Wolf
# November 2, 2021
# This project demonstrates my understanding of
# classified machine learning using logistic regression and decision trees.
The increasing number of cancellations calls for a Machine Learning based solution that can help in predicting which bookings are likely to be canceled. INN Hotels Group, a chain of hotels in Portugal, is facing problems with a high number of booking cancellations and has reached out to your firm for data-driven solutions. You, as a data scientist, have to analyze the data provided to find which factors have a high influence on booking cancellations, build a predictive model that can predict in advance which booking is going to be canceled, and help in formulating profitable policies for cancellations and refunds.
# Use the workalendar package to look up public holidays in Portugal
from workalendar.europe import Portugal

cal = Portugal()
# Show the 2018 holiday list (2018 is the dominant arrival year in this data)
cal.holidays(2018)
[(datetime.date(2018, 1, 1), 'New year'), (datetime.date(2018, 3, 30), 'Good Friday'), (datetime.date(2018, 4, 1), 'Easter Sunday'), (datetime.date(2018, 4, 25), 'Dia da Liberdade'), (datetime.date(2018, 5, 1), 'Dia do Trabalhador'), (datetime.date(2018, 5, 31), 'Corpus Christi'), (datetime.date(2018, 6, 10), 'Dia de Portugal'), (datetime.date(2018, 8, 15), 'Assunção de Nossa Senhora'), (datetime.date(2018, 10, 5), 'Implantação da República'), (datetime.date(2018, 11, 1), 'Todos os santos'), (datetime.date(2018, 12, 1), 'Restauração da Independência'), (datetime.date(2018, 12, 8), 'Imaculada Conceição'), (datetime.date(2018, 12, 25), 'Christmas Day')]
# this will help in making the Python code more structured automatically (good coding practice)
%load_ext nb_black
# Silence all warnings globally.
# NOTE(review): this also hides pandas SettingWithCopyWarning and deprecation
# notices — consider narrowing the filter while developing.
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter("ignore", ConvergenceWarning)
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
# Fix the NumPy RNG seed so sampling/splits are reproducible
np.random.seed(55)
# Library to split data
from sklearn.model_selection import train_test_split
# libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("talk")
# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# To build model for prediction
import statsmodels.stats.api as sms
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LogisticRegression
# To get different metric scores
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
# (replaced by ConfusionMatrixDisplay) — confirm the pinned sklearn version.
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    plot_confusion_matrix,
    precision_recall_curve,
    roc_curve,
    make_scorer,
)
# Libraries to build decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# To tune different models
from sklearn.model_selection import GridSearchCV
# Read the CSV file; keep the raw frame untouched and work on a copy
inn = pd.read_csv("INNHotelsGroup.csv")
df = inn.copy()

# How many rows contain at least one missing value? (expect 0)
df.isna().any(axis=1).sum()
0
# Check the shape and data types
# (output confirms 36,275 rows x 19 columns with no missing values)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36275 entries, 0 to 36274 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Booking_ID 36275 non-null object 1 no_of_adults 36275 non-null int64 2 no_of_children 36275 non-null int64 3 no_of_weekend_nights 36275 non-null int64 4 no_of_week_nights 36275 non-null int64 5 type_of_meal_plan 36275 non-null object 6 required_car_parking_space 36275 non-null int64 7 room_type_reserved 36275 non-null object 8 lead_time 36275 non-null int64 9 arrival_year 36275 non-null int64 10 arrival_month 36275 non-null int64 11 arrival_date 36275 non-null int64 12 market_segment_type 36275 non-null object 13 repeated_guest 36275 non-null int64 14 no_of_previous_cancellations 36275 non-null int64 15 no_of_previous_bookings_not_canceled 36275 non-null int64 16 avg_price_per_room 36275 non-null float64 17 no_of_special_requests 36275 non-null int64 18 booking_status 36275 non-null object dtypes: float64(1), int64(13), object(5) memory usage: 5.3+ MB
# Take a look at 20 random rows (fixed random_state for reproducibility)
df.sample(n=20, random_state=1)
| Booking_ID | no_of_adults | no_of_children | no_of_weekend_nights | no_of_week_nights | type_of_meal_plan | required_car_parking_space | room_type_reserved | lead_time | arrival_year | arrival_month | arrival_date | market_segment_type | repeated_guest | no_of_previous_cancellations | no_of_previous_bookings_not_canceled | avg_price_per_room | no_of_special_requests | booking_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 30392 | INN30393 | 1 | 0 | 1 | 0 | Not Selected | 0 | Room_Type 1 | 53 | 2018 | 9 | 11 | Online | 0 | 0 | 0 | 94.32 | 0 | Not_Canceled |
| 6685 | INN06686 | 2 | 0 | 1 | 2 | Meal Plan 1 | 0 | Room_Type 1 | 63 | 2018 | 4 | 22 | Online | 0 | 0 | 0 | 105.30 | 1 | Canceled |
| 8369 | INN08370 | 2 | 0 | 2 | 3 | Meal Plan 1 | 0 | Room_Type 4 | 55 | 2018 | 9 | 11 | Online | 0 | 0 | 0 | 106.24 | 0 | Not_Canceled |
| 2055 | INN02056 | 2 | 0 | 0 | 2 | Not Selected | 0 | Room_Type 1 | 53 | 2017 | 12 | 29 | Online | 0 | 0 | 0 | 81.00 | 1 | Not_Canceled |
| 10969 | INN10970 | 1 | 0 | 2 | 4 | Meal Plan 1 | 0 | Room_Type 1 | 245 | 2018 | 7 | 6 | Offline | 0 | 0 | 0 | 110.00 | 0 | Canceled |
| 24881 | INN24882 | 2 | 0 | 3 | 7 | Meal Plan 1 | 0 | Room_Type 2 | 231 | 2018 | 8 | 1 | Online | 0 | 0 | 0 | 81.82 | 2 | Canceled |
| 28658 | INN28659 | 2 | 0 | 0 | 3 | Meal Plan 2 | 0 | Room_Type 1 | 71 | 2018 | 5 | 10 | Offline | 0 | 0 | 0 | 126.00 | 1 | Not_Canceled |
| 20853 | INN20854 | 2 | 0 | 1 | 2 | Meal Plan 1 | 0 | Room_Type 1 | 66 | 2017 | 10 | 9 | Offline | 0 | 0 | 0 | 75.00 | 0 | Canceled |
| 8501 | INN08502 | 2 | 0 | 0 | 3 | Meal Plan 1 | 0 | Room_Type 2 | 40 | 2018 | 1 | 14 | Online | 0 | 0 | 0 | 77.55 | 1 | Not_Canceled |
| 1942 | INN01943 | 2 | 0 | 0 | 2 | Meal Plan 1 | 0 | Room_Type 1 | 63 | 2018 | 8 | 9 | Online | 0 | 0 | 0 | 144.90 | 2 | Not_Canceled |
| 15648 | INN15649 | 2 | 0 | 2 | 4 | Meal Plan 1 | 0 | Room_Type 1 | 209 | 2018 | 7 | 2 | Online | 0 | 0 | 0 | 66.53 | 1 | Not_Canceled |
| 6116 | INN06117 | 2 | 0 | 0 | 1 | Meal Plan 1 | 0 | Room_Type 1 | 9 | 2018 | 7 | 6 | Online | 0 | 0 | 0 | 139.00 | 1 | Not_Canceled |
| 7868 | INN07869 | 2 | 0 | 2 | 4 | Meal Plan 1 | 0 | Room_Type 4 | 123 | 2018 | 5 | 22 | Online | 0 | 0 | 0 | 114.75 | 1 | Canceled |
| 24527 | INN24528 | 2 | 0 | 0 | 3 | Meal Plan 1 | 0 | Room_Type 1 | 118 | 2018 | 6 | 28 | Online | 0 | 0 | 0 | 96.30 | 0 | Canceled |
| 24227 | INN24228 | 2 | 0 | 1 | 0 | Meal Plan 1 | 0 | Room_Type 4 | 72 | 2018 | 10 | 9 | Online | 0 | 0 | 0 | 132.30 | 3 | Not_Canceled |
| 17216 | INN17217 | 2 | 0 | 0 | 1 | Meal Plan 1 | 0 | Room_Type 1 | 1 | 2018 | 11 | 23 | Online | 0 | 0 | 0 | 120.00 | 0 | Not_Canceled |
| 31124 | INN31125 | 1 | 0 | 0 | 1 | Meal Plan 1 | 0 | Room_Type 1 | 7 | 2017 | 8 | 28 | Corporate | 1 | 1 | 2 | 65.00 | 0 | Not_Canceled |
| 9101 | INN09102 | 1 | 0 | 2 | 1 | Meal Plan 1 | 0 | Room_Type 1 | 12 | 2018 | 10 | 2 | Online | 0 | 0 | 0 | 6.00 | 0 | Not_Canceled |
| 9474 | INN09475 | 2 | 0 | 0 | 2 | Meal Plan 2 | 0 | Room_Type 1 | 63 | 2017 | 9 | 4 | Offline | 0 | 0 | 0 | 116.00 | 0 | Not_Canceled |
| 12782 | INN12783 | 2 | 0 | 0 | 3 | Not Selected | 0 | Room_Type 1 | 18 | 2018 | 1 | 21 | Online | 0 | 0 | 0 | 77.00 | 1 | Not_Canceled |
# Which columns hold string/object data?
objcols = df.select_dtypes(include="object").columns.tolist()
print(objcols)
['Booking_ID', 'type_of_meal_plan', 'room_type_reserved', 'market_segment_type', 'booking_status']
# Show each categorical column's level distribution as percentage strings
num_to_display = 10
for colname in objcols:
    pct = df[colname].value_counts(normalize=True) * 100
    val_counts = pct.round(2).astype(str) + "%"
    print(val_counts.head(num_to_display))
    # Flag truncation for high-cardinality columns (e.g. Booking_ID)
    if len(val_counts) > num_to_display:
        print(f"Only displaying first {num_to_display} of {len(val_counts)} values.")
    print("\n")
INN31596 0.0% INN04762 0.0% INN24925 0.0% INN22855 0.0% INN17784 0.0% INN09054 0.0% INN26967 0.0% INN16533 0.0% INN18221 0.0% INN05403 0.0% Name: Booking_ID, dtype: object Only displaying first 10 of 36275 values. Meal Plan 1 76.73% Not Selected 14.14% Meal Plan 2 9.11% Meal Plan 3 0.01% Name: type_of_meal_plan, dtype: object Room_Type 1 77.55% Room_Type 4 16.7% Room_Type 6 2.66% Room_Type 2 1.91% Room_Type 5 0.73% Room_Type 7 0.44% Room_Type 3 0.02% Name: room_type_reserved, dtype: object Online 63.99% Offline 29.02% Corporate 5.56% Complementary 1.08% Aviation 0.34% Name: market_segment_type, dtype: object Not_Canceled 67.24% Canceled 32.76% Name: booking_status, dtype: object
# Are all the ID numbers unique? No duplicates?
# (True means every row is a distinct booking)
df["Booking_ID"].nunique() == len(df)
True
# For all the repeat and non-repeat customers, are the previous bookings as expected?
# Non-repeat guests (row 0) should show zero prior bookings of either kind.
df.groupby("repeated_guest")[
    ["no_of_previous_cancellations", "no_of_previous_bookings_not_canceled"]
].sum()
| no_of_previous_cancellations | no_of_previous_bookings_not_canceled | |
|---|---|---|
| repeated_guest | ||
| 0 | 0 | 0 |
| 1 | 847 | 5565 |
# For all repeated guests, are there any with a sum of zero in previous bookings?
# (False means every repeat guest has at least one prior booking on record)
df.loc[df["repeated_guest"] == 1][
    ["no_of_previous_cancellations", "no_of_previous_bookings_not_canceled"]
].sum(axis=1).eq(0).any()
False
# For all non-repeat guests, all previous bookings should add up to zero
# (a nonzero total here would indicate inconsistent records)
df.loc[df["repeated_guest"] == 0][
    ["no_of_previous_cancellations", "no_of_previous_bookings_not_canceled"]
].sum(axis=1).sum()
0
We may drop `repeated_guest` before modeling since it is redundant with the previous-booking counts.
# View value counts of repeated_guest
# Share of repeat vs. first-time guests, as percentage strings
df["repeated_guest"].value_counts(normalize=True).mul(100).round(2).astype(str) + "%"
0 97.44% 1 2.56% Name: repeated_guest, dtype: object
2.6% of bookings are by repeat guests
# Statistical summary of numeric types (transposed: one row per column)
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| no_of_adults | 36275.0 | 1.844962 | 0.518715 | 0.0 | 2.0 | 2.00 | 2.0 | 4.0 |
| no_of_children | 36275.0 | 0.105279 | 0.402648 | 0.0 | 0.0 | 0.00 | 0.0 | 10.0 |
| no_of_weekend_nights | 36275.0 | 0.810724 | 0.870644 | 0.0 | 0.0 | 1.00 | 2.0 | 7.0 |
| no_of_week_nights | 36275.0 | 2.204300 | 1.410905 | 0.0 | 1.0 | 2.00 | 3.0 | 17.0 |
| required_car_parking_space | 36275.0 | 0.030986 | 0.173281 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| lead_time | 36275.0 | 85.232557 | 85.930817 | 0.0 | 17.0 | 57.00 | 126.0 | 443.0 |
| arrival_year | 36275.0 | 2017.820427 | 0.383836 | 2017.0 | 2018.0 | 2018.00 | 2018.0 | 2018.0 |
| arrival_month | 36275.0 | 7.423653 | 3.069894 | 1.0 | 5.0 | 8.00 | 10.0 | 12.0 |
| arrival_date | 36275.0 | 15.596995 | 8.740447 | 1.0 | 8.0 | 16.00 | 23.0 | 31.0 |
| repeated_guest | 36275.0 | 0.025637 | 0.158053 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| no_of_previous_cancellations | 36275.0 | 0.023349 | 0.368331 | 0.0 | 0.0 | 0.00 | 0.0 | 13.0 |
| no_of_previous_bookings_not_canceled | 36275.0 | 0.153411 | 1.754171 | 0.0 | 0.0 | 0.00 | 0.0 | 58.0 |
| avg_price_per_room | 36275.0 | 103.423539 | 35.089424 | 0.0 | 80.3 | 99.45 | 120.0 | 540.0 |
| no_of_special_requests | 36275.0 | 0.619655 | 0.786236 | 0.0 | 0.0 | 0.00 | 1.0 | 5.0 |
Observations:
- `avg_price_per_room` has a minimum of 0, which needs investigation
- `Booking_ID` can be dropped right away
- Merge `arrival_year`, `arrival_month`, and `arrival_date` into one datetime column
- `repeated_guest` and `required_car_parking_space` are already one-hot encoded
- Drop `repeated_guest` for modeling because it has collinearity with the previous-booking counts
# Drop Booking_ID
df.drop(["Booking_ID"], axis=1, inplace=True)  # unique ID, no predictive value
# Merge year/month/day into one "YYYY-M-D" string column called arrival_dt
df["arrival_dt"] = df["arrival_year"].astype(str).str.cat(
    [df["arrival_month"].astype(str), df["arrival_date"].astype(str)], sep="-"
)
# Try to convert arrival_date to datetime
###### ERROR #######
# pd.to_datetime(df["arrival_dt"])
###### ERROR #######
# ParserError: day is out of range for month: 2018-2-29
# Find ALL the datetime errors using try-except
count = 0
for i in df["arrival_dt"]:
    try:
        pd.to_datetime(i)
    # NOTE(review): bare `except:` also swallows KeyboardInterrupt/SystemExit;
    # `except ValueError:` would suffice (dateutil's ParserError subclasses it)
    except:
        print(i)
        count += 1
print(count)  # print the number of errors
2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 2018-2-29 37
We are getting 37 errors because 2018 was NOT a leap year, so February 29, 2018 is not a valid date.
# Create a subset dataframe that excludes the invalid date 2018-2-29.
# .copy() makes subset an independent frame, so the column assignments below
# cannot trigger pandas' SettingWithCopyWarning (which the global warning
# filter above would silently hide anyway).
subset = df.loc[df["arrival_dt"] != "2018-2-29"].copy()
# Now convert arrival_dt type from string to datetime
subset["arrival_dt"] = pd.to_datetime(subset["arrival_dt"])
# No errors!
# Create a new column called length_of_stay by adding weekend and week nights
subset["length_of_stay"] = subset["no_of_week_nights"] + subset["no_of_weekend_nights"]
# Departure = arrival + length_of_stay days. pd.to_timedelta converts the whole
# column at once (the original applied a Python lambda per row).
subset["departure_dt"] = subset["arrival_dt"] + pd.to_timedelta(
    subset["length_of_stay"], unit="D"
)
subset.head()
| no_of_adults | no_of_children | no_of_weekend_nights | no_of_week_nights | type_of_meal_plan | required_car_parking_space | room_type_reserved | lead_time | arrival_year | arrival_month | arrival_date | market_segment_type | repeated_guest | no_of_previous_cancellations | no_of_previous_bookings_not_canceled | avg_price_per_room | no_of_special_requests | booking_status | arrival_dt | length_of_stay | departure_dt | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 0 | 1 | 2 | Meal Plan 1 | 0 | Room_Type 1 | 224 | 2017 | 10 | 2 | Offline | 0 | 0 | 0 | 65.00 | 0 | Not_Canceled | 2017-10-02 | 3 | 2017-10-05 |
| 1 | 2 | 0 | 2 | 3 | Not Selected | 0 | Room_Type 1 | 5 | 2018 | 11 | 6 | Online | 0 | 0 | 0 | 106.68 | 1 | Not_Canceled | 2018-11-06 | 5 | 2018-11-11 |
| 2 | 1 | 0 | 2 | 1 | Meal Plan 1 | 0 | Room_Type 1 | 1 | 2018 | 2 | 28 | Online | 0 | 0 | 0 | 60.00 | 0 | Canceled | 2018-02-28 | 3 | 2018-03-03 |
| 3 | 2 | 0 | 0 | 2 | Meal Plan 1 | 0 | Room_Type 1 | 211 | 2018 | 5 | 20 | Online | 0 | 0 | 0 | 100.00 | 0 | Canceled | 2018-05-20 | 2 | 2018-05-22 |
| 4 | 2 | 0 | 1 | 1 | Not Selected | 0 | Room_Type 1 | 48 | 2018 | 4 | 11 | Online | 0 | 0 | 0 | 94.50 | 0 | Canceled | 2018-04-11 | 2 | 2018-04-13 |
# Create 2 new columns counting weekend and week nights between arrival and
# departure dates. Enumerate each stay's nights ONCE (the original built the
# identical pd.date_range twice per row) and classify each night by weekday.
weekend_counts = []
weekday_counts = []
for start, end in zip(subset["arrival_dt"], subset["departure_dt"]):
    # Nights slept: arrival date through the night before departure
    nights = pd.date_range(start, end - pd.DateOffset(days=1)).weekday
    weekend_counts.append(nights.isin([5, 6]).sum())  # Sat/Sun nights
    weekday_counts.append(nights.isin([0, 1, 2, 3, 4]).sum())  # Mon-Fri nights
subset["weekends_count"] = weekend_counts
subset["weekdays_count"] = weekday_counts
# Now let's compare our counts based on datetime to the original counts in the dataset
# How many rows have the correct number of weekend nights?
print((subset["no_of_weekend_nights"] == subset["weekends_count"]).sum())
# How many rows have the correct number of week nights?
print((subset["no_of_week_nights"] == subset["weekdays_count"]).sum())
# Print the matches and check for pattern
subset.loc[subset["no_of_weekend_nights"] == subset["weekends_count"]].sample(
    n=20, random_state=1
)
9996 9996
| no_of_adults | no_of_children | no_of_weekend_nights | no_of_week_nights | type_of_meal_plan | required_car_parking_space | room_type_reserved | lead_time | arrival_year | arrival_month | arrival_date | market_segment_type | repeated_guest | no_of_previous_cancellations | no_of_previous_bookings_not_canceled | avg_price_per_room | no_of_special_requests | booking_status | arrival_dt | length_of_stay | departure_dt | weekends_count | weekdays_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 22562 | 1 | 0 | 0 | 2 | Meal Plan 2 | 0 | Room_Type 1 | 102 | 2017 | 10 | 16 | Offline | 0 | 0 | 0 | 80.00 | 0 | Canceled | 2017-10-16 | 2 | 2017-10-18 | 0 | 2 |
| 3294 | 2 | 0 | 1 | 3 | Not Selected | 0 | Room_Type 1 | 93 | 2018 | 12 | 26 | Online | 0 | 0 | 0 | 92.51 | 2 | Not_Canceled | 2018-12-26 | 4 | 2018-12-30 | 1 | 3 |
| 32860 | 2 | 0 | 0 | 2 | Meal Plan 1 | 1 | Room_Type 1 | 50 | 2018 | 10 | 4 | Online | 0 | 0 | 0 | 127.80 | 2 | Not_Canceled | 2018-10-04 | 2 | 2018-10-06 | 0 | 2 |
| 24037 | 1 | 0 | 0 | 2 | Meal Plan 1 | 0 | Room_Type 1 | 22 | 2018 | 9 | 27 | Online | 0 | 0 | 0 | 133.00 | 1 | Not_Canceled | 2018-09-27 | 2 | 2018-09-29 | 0 | 2 |
| 10824 | 1 | 0 | 0 | 1 | Meal Plan 1 | 0 | Room_Type 1 | 4 | 2018 | 11 | 29 | Corporate | 0 | 0 | 0 | 79.00 | 1 | Not_Canceled | 2018-11-29 | 1 | 2018-11-30 | 0 | 1 |
| 26327 | 2 | 2 | 1 | 2 | Meal Plan 1 | 0 | Room_Type 6 | 0 | 2017 | 8 | 31 | Online | 0 | 0 | 0 | 155.33 | 0 | Not_Canceled | 2017-08-31 | 3 | 2017-09-03 | 1 | 2 |
| 28333 | 1 | 0 | 0 | 1 | Meal Plan 1 | 0 | Room_Type 1 | 34 | 2018 | 1 | 26 | Corporate | 0 | 0 | 0 | 59.00 | 0 | Not_Canceled | 2018-01-26 | 1 | 2018-01-27 | 0 | 1 |
| 8854 | 2 | 1 | 1 | 3 | Meal Plan 1 | 0 | Room_Type 1 | 71 | 2018 | 11 | 28 | Online | 0 | 0 | 0 | 68.07 | 2 | Not_Canceled | 2018-11-28 | 4 | 2018-12-02 | 1 | 3 |
| 1129 | 2 | 0 | 2 | 5 | Not Selected | 0 | Room_Type 1 | 17 | 2018 | 12 | 30 | Online | 0 | 0 | 0 | 90.34 | 1 | Not_Canceled | 2018-12-30 | 7 | 2019-01-06 | 2 | 5 |
| 3416 | 1 | 0 | 0 | 1 | Not Selected | 0 | Room_Type 1 | 1 | 2018 | 8 | 24 | Online | 0 | 0 | 0 | 81.19 | 0 | Not_Canceled | 2018-08-24 | 1 | 2018-08-25 | 0 | 1 |
| 4116 | 2 | 0 | 0 | 2 | Meal Plan 2 | 0 | Room_Type 1 | 102 | 2017 | 10 | 16 | Online | 0 | 0 | 0 | 101.50 | 0 | Canceled | 2017-10-16 | 2 | 2017-10-18 | 0 | 2 |
| 14018 | 2 | 0 | 2 | 5 | Meal Plan 1 | 0 | Room_Type 1 | 31 | 2018 | 11 | 14 | Online | 0 | 0 | 0 | 64.67 | 1 | Not_Canceled | 2018-11-14 | 7 | 2018-11-21 | 2 | 5 |
| 25117 | 1 | 0 | 1 | 3 | Meal Plan 1 | 0 | Room_Type 1 | 22 | 2017 | 9 | 17 | Online | 0 | 0 | 0 | 109.25 | 3 | Not_Canceled | 2017-09-17 | 4 | 2017-09-21 | 1 | 3 |
| 8748 | 2 | 0 | 0 | 1 | Not Selected | 0 | Room_Type 1 | 122 | 2018 | 10 | 4 | Online | 0 | 0 | 0 | 108.00 | 1 | Canceled | 2018-10-04 | 1 | 2018-10-05 | 0 | 1 |
| 6485 | 2 | 0 | 1 | 2 | Meal Plan 2 | 0 | Room_Type 1 | 309 | 2018 | 5 | 13 | Online | 0 | 0 | 0 | 101.00 | 0 | Canceled | 2018-05-13 | 3 | 2018-05-16 | 1 | 2 |
| 17958 | 2 | 0 | 1 | 2 | Meal Plan 1 | 0 | Room_Type 1 | 267 | 2018 | 9 | 30 | Offline | 0 | 0 | 0 | 95.00 | 0 | Canceled | 2018-09-30 | 3 | 2018-10-03 | 1 | 2 |
| 9869 | 2 | 0 | 0 | 1 | Meal Plan 2 | 0 | Room_Type 1 | 219 | 2017 | 9 | 4 | Offline | 0 | 0 | 0 | 108.00 | 0 | Canceled | 2017-09-04 | 1 | 2017-09-05 | 0 | 1 |
| 32676 | 2 | 0 | 1 | 2 | Not Selected | 0 | Room_Type 1 | 229 | 2018 | 12 | 9 | Online | 0 | 0 | 0 | 67.50 | 0 | Canceled | 2018-12-09 | 3 | 2018-12-12 | 1 | 2 |
| 33697 | 2 | 0 | 0 | 2 | Not Selected | 0 | Room_Type 1 | 1 | 2018 | 8 | 2 | Online | 0 | 0 | 0 | 124.00 | 1 | Not_Canceled | 2018-08-02 | 2 | 2018-08-04 | 0 | 2 |
| 27964 | 1 | 0 | 0 | 1 | Meal Plan 1 | 0 | Room_Type 1 | 9 | 2018 | 5 | 17 | Online | 0 | 0 | 0 | 106.26 | 1 | Not_Canceled | 2018-05-17 | 1 | 2018-05-18 | 0 | 1 |
# Fraction of bookings whose recorded weekend nights match the derived count.
# NOTE(review): the 9996 matches were found within `subset` (36,238 rows after
# excluding invalid dates), so dividing by len(df) slightly understates the
# share — confirm the intended denominator.
9996 / len(df)
0.2755616815988973
# Let's look at all the bookings over time (minus the 37 invalid dates)
plt.figure(figsize=(16, 8))
sns.histplot(
    x="arrival_dt",
    data=subset,
    hue="booking_status",
    # multiple="stack",
    bins=100,
    kde=True,
    hue_order=["Not_Canceled", "Canceled"],  # fixed order keeps colors consistent
)
plt.xticks(rotation=35)  # tilt date labels so they don't overlap
(array([17348., 17410., 17471., 17532., 17591., 17652., 17713., 17775.,
17836., 17897.]),
[Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, '')])
# For fun, let's look at the same histogram, but with only the 9996 rows with validated data
# (rows where the recorded weekend nights agree with the datetime-derived count)
vdf = subset.loc[subset["no_of_weekend_nights"] == subset["weekends_count"]].copy()
plt.figure(figsize=(16, 8))
sns.histplot(
    x="arrival_dt",
    data=vdf,
    hue="booking_status",
    # multiple="stack",
    bins=100,
    kde=True,
    hue_order=["Not_Canceled", "Canceled"],
)
plt.xticks(rotation=35)
(array([17348., 17410., 17471., 17532., 17591., 17652., 17713., 17775.,
17836., 17897.]),
[Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, ''),
Text(0, 0, '')])
## 1. What are the busiest months in the hotel?
# Let's plot 2018 only since we don't have a complete year of 2017
plt.figure(figsize=(8, 5))
sns.histplot(
    x="arrival_month", data=df.loc[df["arrival_year"] == 2018], bins=12,
)  # 12 bins = one per month
# View the value counts of each month in descending order
df.loc[df["arrival_year"] == 2018].groupby("arrival_month")[
    "arrival_month"
].count().sort_values(ascending=False)
arrival_month 10 3404 6 3203 9 2962 8 2799 4 2736 5 2598 7 2557 3 2358 11 2333 12 2093 2 1704 1 1014 Name: arrival_month, dtype: int64
The busiest months are October, June, and September. The slowest month is January.
## 2. Which market segment do most of the guests come from?
# Count bookings per market segment
plt.figure(figsize=(8, 5))
sns.histplot(x="market_segment_type", data=df)
plt.xticks(rotation=45)
# Same distribution as percentage strings
df["market_segment_type"].value_counts(normalize=True).mul(100).round(2).astype(str)+"%"
Online 63.99% Offline 29.02% Corporate 5.56% Complementary 1.08% Aviation 0.34% Name: market_segment_type, dtype: object
## 3. What are the differences in room prices in different market segments?
# Mean room price per market segment (error bars: seaborn's default CI)
plt.figure(figsize=(8, 5))
sns.barplot(x="market_segment_type", y="avg_price_per_room", data=df)
plt.xticks(rotation=45)
(array([0, 1, 2, 3, 4]), [Text(0, 0, 'Offline'), Text(1, 0, 'Online'), Text(2, 0, 'Corporate'), Text(3, 0, 'Aviation'), Text(4, 0, 'Complementary')])
## 4. What are the differences in room type in different market segments?
# Room types reserved within each market segment.
# seaborn.catplot is a FIGURE-LEVEL function that creates its own figure
# (sized via height/aspect), so the original's preceding plt.figure() call
# only produced a stray empty figure (visible as "<Figure ... with 0 Axes>"
# in the output) — removed.
sns.catplot(
    hue="room_type_reserved",
    x="market_segment_type",
    kind="count",
    edgecolor=".6",
    data=df,
    height=5,
    aspect=2,
)
plt.xticks(rotation=45)
(array([0, 1, 2, 3, 4]), [Text(0, 0, 'Offline'), Text(1, 0, 'Online'), Text(2, 0, 'Corporate'), Text(3, 0, 'Aviation'), Text(4, 0, 'Complementary')])
<Figure size 936x360 with 0 Axes>
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Print the category crosstab (with margins) and plot a stacked bar chart
    of target proportions within each predictor level.

    data: dataframe
    predictor: independent variable (x-axis categories)
    target: target variable (stack segments)
    """
    count = data[predictor].nunique()
    # Sort rows by the least frequent target category
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # Row-normalized crosstab: each bar stacks to 100%
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 6))
    # Single legend call outside the axes. (The original called plt.legend
    # twice; the first, lower-left frameless legend was immediately replaced
    # by the second call and had no effect.)
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.xticks(rotation=45)
    plt.show()
# Room-type mix within each market segment (counts + stacked shares)
stacked_barplot(df, "market_segment_type", "room_type_reserved")
room_type_reserved Room_Type 1 Room_Type 2 Room_Type 3 Room_Type 4 \ market_segment_type All 28130 692 7 6057 Complementary 247 20 2 52 Offline 9747 57 2 613 Online 16243 613 2 5228 Corporate 1833 2 1 99 Aviation 60 0 0 65 room_type_reserved Room_Type 5 Room_Type 6 Room_Type 7 All market_segment_type All 265 966 158 36275 Complementary 17 14 39 391 Offline 81 23 5 10528 Online 93 926 109 23214 Corporate 74 3 5 2017 Aviation 0 0 0 125 ------------------------------------------------------------------------------------------------------------------------
## 5. What percentage of bookings are canceled?
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with the count or percentage annotated above each bar.

    data: dataframe
    feature: dataframe column to plot
    perc: whether to display percentages instead of counts (default False)
    n: display only the top n category levels (default None = all levels)
    """
    total = len(data[feature])  # number of rows (denominator for percentages)
    count = data[feature].nunique()
    # Scale the figure width with the number of bars shown
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))
    plt.xticks(rotation=45, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:  # idiomatic truthiness test (was `perc == True`)
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # place the label just above the bar
    plt.show()  # show the plot
# Overall cancellation rate (percentages shown on the bars)
labeled_barplot(df, "booking_status", perc=True)
## 6. What percentage of repeating guests cancel?
# Cancellation rate among repeat guests only
labeled_barplot(df.loc[df["repeated_guest"] == 1], "booking_status", perc=True)
## 7. How are all the variables correlated to booking status?
# Correlations against booking_status encoded as 1 = Canceled, 0 = Not_Canceled.
# Compute them on a temporarily encoded copy of df instead of the original
# map-then-map-back round trip, which mutated df in place and would leave it
# corrupted if anything failed between the two map calls.
corr = df.assign(
    booking_status=df["booking_status"].map({"Canceled": 1, "Not_Canceled": 0})
).corr()
# Plot correlation heatmap
plt.figure(figsize=(25, 12))
sns.heatmap(corr, annot=True, vmin=-1, vmax=1, cmap="Spectral")
# Print correlations in descending order
print(corr["booking_status"].sort_values(ascending=False))
booking_status 1.000000 lead_time 0.438538 arrival_year 0.179529 avg_price_per_room 0.142569 no_of_week_nights 0.092996 no_of_adults 0.086920 no_of_weekend_nights 0.061563 no_of_children 0.033078 arrival_date 0.010629 arrival_month -0.011233 no_of_previous_cancellations -0.033728 no_of_previous_bookings_not_canceled -0.060179 required_car_parking_space -0.086185 repeated_guest -0.107287 no_of_special_requests -0.253070 Name: booking_status, dtype: float64
`arrival_month` and `arrival_date` have the least correlation with `booking_status`, and they should be treated as categories rather than ordinal numbers. The strongest correlations are `lead_time` (positive) and `no_of_special_requests` (negative).
## 8. How do special requests affect booking cancellation?
# Mean number of special requests by booking status
sns.barplot(data=df, x="booking_status", y="no_of_special_requests")
<AxesSubplot:xlabel='booking_status', ylabel='no_of_special_requests'>
# Crosstab of special-request counts vs. status (output shows no cancellations at 3+ requests)
stacked_barplot(df, "booking_status", "no_of_special_requests")
no_of_special_requests 0 1 2 3 4 5 All booking_status Not_Canceled 11232 8670 3727 675 78 8 24390 All 19777 11373 4364 675 78 8 36275 Canceled 8545 2703 637 0 0 0 11885 ------------------------------------------------------------------------------------------------------------------------
# Look at a histogram of each variable, stacked by booking status
for i in df.columns[:-2]:  # skip last two which are booking_status and arrival_dt
    plt.figure(figsize=(13, 5))
    sns.histplot(data=df, x=i, hue="booking_status", multiple="stack")
    plt.show()
# Are all free rooms in the complementary segment?
# (rows priced at exactly 0, split by market segment and status)
free_rooms = df.loc[df["avg_price_per_room"] == 0]
plt.figure(figsize=(8, 5))
sns.histplot(
    x="market_segment_type", data=free_rooms, hue="booking_status", multiple="stack"
)
<AxesSubplot:xlabel='market_segment_type', ylabel='Count'>
# How often are complementary rooms canceled?
# (price distribution within the Complementary segment, split by status)
comp_rooms = df.loc[df["market_segment_type"] == "Complementary"]
plt.figure(figsize=(8, 5))
sns.histplot(
    x="avg_price_per_room", data=comp_rooms, hue="booking_status",
)
<AxesSubplot:xlabel='avg_price_per_room', ylabel='Count'>
## Which room type is the most expensive?
# Mean price per room type, split by booking status
plt.figure(figsize=(15, 5))
sns.barplot(
    data=df, x="room_type_reserved", y="avg_price_per_room", hue="booking_status"
)
<AxesSubplot:xlabel='room_type_reserved', ylabel='avg_price_per_room'>
# 9. How has the average price of rooms changed over time?
# (uses subset, which contains only the valid arrival dates)
plt.figure(figsize=(13, 8))
sns.lineplot(data=subset, x="arrival_dt", y="avg_price_per_room")
<AxesSubplot:xlabel='arrival_dt', ylabel='avg_price_per_room'>
# Drop columns not used for modeling, then encode the categoricals:
# - arrival_date / arrival_dt: day-level detail not needed
# - repeated_guest: collinear with the previous-booking counts
df.drop(["arrival_date", "arrival_dt", "repeated_guest"], axis=1, inplace=True)

# Binary-encode the target
df["booking_status"] = df["booking_status"].map({"Canceled": 1, "Not_Canceled": 0})

# Ordinal-encode meal plan (0 = none selected)
df["type_of_meal_plan"] = df["type_of_meal_plan"].map(
    {"Not Selected": 0, "Meal Plan 1": 1, "Meal Plan 2": 2, "Meal Plan 3": 3}
)

# One-hot encode the remaining categoricals, dropping each first level
df = pd.get_dummies(
    df,
    columns=["arrival_month", "room_type_reserved", "market_segment_type"],
    drop_first=True,
)
df.head()
| no_of_adults | no_of_children | no_of_weekend_nights | no_of_week_nights | type_of_meal_plan | required_car_parking_space | lead_time | arrival_year | no_of_previous_cancellations | no_of_previous_bookings_not_canceled | avg_price_per_room | no_of_special_requests | booking_status | arrival_month_2 | arrival_month_3 | arrival_month_4 | arrival_month_5 | arrival_month_6 | arrival_month_7 | arrival_month_8 | arrival_month_9 | arrival_month_10 | arrival_month_11 | arrival_month_12 | room_type_reserved_Room_Type 2 | room_type_reserved_Room_Type 3 | room_type_reserved_Room_Type 4 | room_type_reserved_Room_Type 5 | room_type_reserved_Room_Type 6 | room_type_reserved_Room_Type 7 | market_segment_type_Complementary | market_segment_type_Corporate | market_segment_type_Offline | market_segment_type_Online | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 0 | 1 | 2 | 1 | 0 | 224 | 2017 | 0 | 0 | 65.00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 2 | 0 | 2 | 3 | 0 | 0 | 5 | 2018 | 0 | 0 | 106.68 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 1 | 0 | 2 | 1 | 1 | 0 | 1 | 2018 | 0 | 0 | 60.00 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 2 | 0 | 0 | 2 | 1 | 0 | 211 | 2018 | 0 | 0 | 100.00 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | 2 | 0 | 1 | 1 | 0 | 0 | 48 | 2018 | 0 | 0 | 94.50 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
It's a good idea to explore the data again after manipulations
# Full summary after encoding — every column should now be numeric
df.describe(include="all").T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| no_of_adults | 36275.0 | 1.844962 | 0.518715 | 0.0 | 2.0 | 2.00 | 2.0 | 4.0 |
| no_of_children | 36275.0 | 0.105279 | 0.402648 | 0.0 | 0.0 | 0.00 | 0.0 | 10.0 |
| no_of_weekend_nights | 36275.0 | 0.810724 | 0.870644 | 0.0 | 0.0 | 1.00 | 2.0 | 7.0 |
| no_of_week_nights | 36275.0 | 2.204300 | 1.410905 | 0.0 | 1.0 | 2.00 | 3.0 | 17.0 |
| type_of_meal_plan | 36275.0 | 0.949966 | 0.480191 | 0.0 | 1.0 | 1.00 | 1.0 | 3.0 |
| required_car_parking_space | 36275.0 | 0.030986 | 0.173281 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| lead_time | 36275.0 | 85.232557 | 85.930817 | 0.0 | 17.0 | 57.00 | 126.0 | 443.0 |
| arrival_year | 36275.0 | 2017.820427 | 0.383836 | 2017.0 | 2018.0 | 2018.00 | 2018.0 | 2018.0 |
| no_of_previous_cancellations | 36275.0 | 0.023349 | 0.368331 | 0.0 | 0.0 | 0.00 | 0.0 | 13.0 |
| no_of_previous_bookings_not_canceled | 36275.0 | 0.153411 | 1.754171 | 0.0 | 0.0 | 0.00 | 0.0 | 58.0 |
| avg_price_per_room | 36275.0 | 103.423539 | 35.089424 | 0.0 | 80.3 | 99.45 | 120.0 | 540.0 |
| no_of_special_requests | 36275.0 | 0.619655 | 0.786236 | 0.0 | 0.0 | 0.00 | 1.0 | 5.0 |
| booking_status | 36275.0 | 0.327636 | 0.469358 | 0.0 | 0.0 | 0.00 | 1.0 | 1.0 |
| arrival_month_2 | 36275.0 | 0.046975 | 0.211587 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| arrival_month_3 | 36275.0 | 0.065003 | 0.246535 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| arrival_month_4 | 36275.0 | 0.075424 | 0.264078 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| arrival_month_5 | 36275.0 | 0.071620 | 0.257861 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| arrival_month_6 | 36275.0 | 0.088298 | 0.283731 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| arrival_month_7 | 36275.0 | 0.080496 | 0.272064 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| arrival_month_8 | 36275.0 | 0.105114 | 0.306704 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| arrival_month_9 | 36275.0 | 0.127112 | 0.333103 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| arrival_month_10 | 36275.0 | 0.146575 | 0.353686 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| arrival_month_11 | 36275.0 | 0.082150 | 0.274597 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| arrival_month_12 | 36275.0 | 0.083280 | 0.276310 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| room_type_reserved_Room_Type 2 | 36275.0 | 0.019076 | 0.136796 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| room_type_reserved_Room_Type 3 | 36275.0 | 0.000193 | 0.013890 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| room_type_reserved_Room_Type 4 | 36275.0 | 0.166975 | 0.372958 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| room_type_reserved_Room_Type 5 | 36275.0 | 0.007305 | 0.085159 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| room_type_reserved_Room_Type 6 | 36275.0 | 0.026630 | 0.161001 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| room_type_reserved_Room_Type 7 | 36275.0 | 0.004356 | 0.065854 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| market_segment_type_Complementary | 36275.0 | 0.010779 | 0.103261 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| market_segment_type_Corporate | 36275.0 | 0.055603 | 0.229157 | 0.0 | 0.0 | 0.00 | 0.0 | 1.0 |
| market_segment_type_Offline | 36275.0 | 0.290227 | 0.453873 | 0.0 | 0.0 | 0.00 | 1.0 | 1.0 |
| market_segment_type_Online | 36275.0 | 0.639945 | 0.480023 | 0.0 | 0.0 | 1.00 | 1.0 | 1.0 |
# Plot a per-variable histogram, colored by cancellation status, to see
# how each feature separates canceled vs. honored bookings.
for col in df.columns.drop("booking_status"):  # skip booking_status
    plt.figure(figsize=(8, 5))
    sns.histplot(data=df, x=col, hue="booking_status", kde=True)
    plt.show()
# Reprint booking status correlations in descending order
print(df.corr()["booking_status"].sort_values(ascending=False))
booking_status 1.000000 lead_time 0.438538 arrival_year 0.179529 avg_price_per_room 0.142569 market_segment_type_Online 0.106362 no_of_week_nights 0.092996 no_of_adults 0.086920 arrival_month_7 0.077138 no_of_weekend_nights 0.061563 arrival_month_6 0.050010 type_of_meal_plan 0.049374 arrival_month_8 0.045717 no_of_children 0.033078 room_type_reserved_Room_Type 6 0.032652 arrival_month_10 0.022910 arrival_month_5 0.022049 arrival_month_4 0.021928 room_type_reserved_Room_Type 4 0.013309 arrival_month_9 0.004808 room_type_reserved_Room_Type 2 0.000548 room_type_reserved_Room_Type 3 -0.001241 room_type_reserved_Room_Type 5 -0.010224 room_type_reserved_Room_Type 7 -0.014062 arrival_month_3 -0.017288 arrival_month_11 -0.021680 no_of_previous_cancellations -0.033728 arrival_month_2 -0.035613 market_segment_type_Offline -0.038351 no_of_previous_bookings_not_canceled -0.060179 market_segment_type_Complementary -0.072867 required_car_parking_space -0.086185 market_segment_type_Corporate -0.112993 arrival_month_12 -0.124947 no_of_special_requests -0.253070 Name: booking_status, dtype: float64
lead_time still has the highest correlation with cancellations, followed by arrival_year and avg_price_per_room. The Online market segment type appears to have a slight positive correlation with cancellations as well.
y = df["booking_status"].copy()
# Design matrix: every feature plus an intercept column for statsmodels.
X = sm.add_constant(df.drop(["booking_status"], axis=1))
# Variance-inflation factor of each column, used to spot multicollinearity
# before feature selection.
vif_series = pd.Series(
    [variance_inflation_factor(X.values, idx) for idx in range(len(X.columns))],
    index=X.columns,
    dtype=float,
)
print("Series before feature selection: \n\n{}\n".format(vif_series))
Series before feature selection: const 4.311667e+07 no_of_adults 1.377761e+00 no_of_children 2.020617e+00 no_of_weekend_nights 1.074599e+00 no_of_week_nights 1.097486e+00 type_of_meal_plan 1.448025e+00 required_car_parking_space 1.035314e+00 lead_time 1.523262e+00 arrival_year 1.559616e+00 no_of_previous_cancellations 1.300598e+00 no_of_previous_bookings_not_canceled 1.418885e+00 avg_price_per_room 2.471721e+00 no_of_special_requests 1.253138e+00 arrival_month_2 2.578339e+00 arrival_month_3 3.176283e+00 arrival_month_4 3.560207e+00 arrival_month_5 3.561513e+00 arrival_month_6 4.124826e+00 arrival_month_7 3.945392e+00 arrival_month_8 4.796063e+00 arrival_month_9 5.669893e+00 arrival_month_10 6.099917e+00 arrival_month_11 3.834533e+00 arrival_month_12 3.904557e+00 room_type_reserved_Room_Type 2 1.091278e+00 room_type_reserved_Room_Type 3 1.002181e+00 room_type_reserved_Room_Type 4 1.312286e+00 room_type_reserved_Room_Type 5 1.029945e+00 room_type_reserved_Room_Type 6 2.009079e+00 room_type_reserved_Room_Type 7 1.097208e+00 market_segment_type_Complementary 4.419565e+00 market_segment_type_Corporate 1.656967e+01 market_segment_type_Offline 6.263492e+01 market_segment_type_Online 6.951248e+01 dtype: float64
# Hold out 30% of the rows for testing; a fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1
)
# Fit a statsmodels logistic regression on the training partition and show
# the full coefficient table.
lg = sm.Logit(y_train, X_train.astype(float)).fit(disp=False)
print(lg.summary())
Logit Regression Results
==============================================================================
Dep. Variable: booking_status No. Observations: 25392
Model: Logit Df Residuals: 25358
Method: MLE Df Model: 33
Date: Fri, 19 Nov 2021 Pseudo R-squ.: 0.3473
Time: 20:18:51 Log-Likelihood: -10503.
converged: False LL-Null: -16091.
Covariance Type: nonrobust LLR p-value: 0.000
========================================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------------------------
const -927.8714 127.420 -7.282 0.000 -1177.610 -678.133
no_of_adults 0.1611 0.039 4.144 0.000 0.085 0.237
no_of_children 0.1883 0.060 3.148 0.002 0.071 0.306
no_of_weekend_nights 0.1128 0.020 5.590 0.000 0.073 0.152
no_of_week_nights 0.0439 0.013 3.475 0.001 0.019 0.069
type_of_meal_plan -0.0998 0.043 -2.305 0.021 -0.185 -0.015
required_car_parking_space -1.5999 0.139 -11.528 0.000 -1.872 -1.328
lead_time 0.0162 0.000 56.037 0.000 0.016 0.017
arrival_year 0.4573 0.063 7.243 0.000 0.334 0.581
no_of_previous_cancellations 0.1899 0.112 1.691 0.091 -0.030 0.410
no_of_previous_bookings_not_canceled -0.7533 0.247 -3.044 0.002 -1.238 -0.268
avg_price_per_room 0.0194 0.001 23.146 0.000 0.018 0.021
no_of_special_requests -1.5169 0.031 -49.447 0.000 -1.577 -1.457
arrival_month_2 2.8355 0.267 10.617 0.000 2.312 3.359
arrival_month_3 2.6385 0.264 9.992 0.000 2.121 3.156
arrival_month_4 2.3550 0.263 8.953 0.000 1.839 2.871
arrival_month_5 1.9507 0.265 7.351 0.000 1.431 2.471
arrival_month_6 2.1855 0.264 8.282 0.000 1.668 2.703
arrival_month_7 2.1124 0.265 7.967 0.000 1.593 2.632
arrival_month_8 2.1155 0.265 7.993 0.000 1.597 2.634
arrival_month_9 2.0380 0.265 7.676 0.000 1.518 2.558
arrival_month_10 2.2882 0.264 8.672 0.000 1.771 2.805
arrival_month_11 2.6646 0.265 10.040 0.000 2.144 3.185
arrival_month_12 0.7978 0.272 2.934 0.003 0.265 1.331
room_type_reserved_Room_Type 2 -0.3669 0.135 -2.720 0.007 -0.631 -0.103
room_type_reserved_Room_Type 3 -0.2362 1.304 -0.181 0.856 -2.793 2.320
room_type_reserved_Room_Type 4 -0.3464 0.053 -6.575 0.000 -0.450 -0.243
room_type_reserved_Room_Type 5 -0.8091 0.212 -3.821 0.000 -1.224 -0.394
room_type_reserved_Room_Type 6 -1.0180 0.152 -6.712 0.000 -1.315 -0.721
room_type_reserved_Room_Type 7 -1.5090 0.299 -5.051 0.000 -2.094 -0.923
market_segment_type_Complementary -22.6138 1.07e+04 -0.002 0.998 -2.11e+04 2.1e+04
market_segment_type_Corporate -1.1401 0.262 -4.351 0.000 -1.654 -0.627
market_segment_type_Offline -2.1020 0.251 -8.378 0.000 -2.594 -1.610
market_segment_type_Online -0.2766 0.248 -1.117 0.264 -0.762 0.209
========================================================================================================
# defining a function to compute different metrics to check performance of a classification model built using statsmodels
def model_performance_classification_statsmodels(
    model, predictors, target, threshold=0.5
):
    """
    Compute accuracy, recall, precision, and F1 for a statsmodels classifier.

    model: fitted classifier whose ``predict`` returns class-1 probabilities
    predictors: independent variables
    target: dependent variable (0/1)
    threshold: probability cutoff for classifying an observation as class 1

    Returns a one-row DataFrame with columns Accuracy, Recall, Precision, F1.
    """
    # Probabilities above the threshold are class 1; the comparison already
    # yields 0/1 values, so the previous extra np.round step was redundant.
    pred = (model.predict(predictors) > threshold).astype(int)
    acc = accuracy_score(target, pred)  # to compute Accuracy
    recall = recall_score(target, pred)  # to compute Recall
    precision = precision_score(target, pred)  # to compute Precision
    f1 = f1_score(target, pred)  # to compute F1-score
    # assemble the metrics into a single-row dataframe
    return pd.DataFrame(
        {"Accuracy": acc, "Recall": recall, "Precision": precision, "F1": f1},
        index=[0],
    )
# Evaluate the fitted logistic model on the same data it was trained on.
print("Training performance:")
model_performance_classification_statsmodels(lg, X_train, y_train)
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.807656 | 0.646299 | 0.73728 | 0.688798 |
# Backward elimination: repeatedly refit the logistic model and drop the
# single feature with the largest p-value until every remaining p-value is
# at or below 0.05.
cols = X_train.columns.tolist()
while cols:
    fitted = sm.Logit(y_train, X_train[cols]).fit(disp=False)
    p_values = fitted.pvalues
    worst_feature = p_values.idxmax()
    if p_values[worst_feature] <= 0.05:
        break  # all remaining features are statistically significant
    cols.remove(worst_feature)
selected_features = cols
print(selected_features)
['const', 'no_of_adults', 'no_of_children', 'no_of_weekend_nights', 'no_of_week_nights', 'type_of_meal_plan', 'required_car_parking_space', 'lead_time', 'arrival_year', 'no_of_previous_bookings_not_canceled', 'avg_price_per_room', 'no_of_special_requests', 'arrival_month_2', 'arrival_month_3', 'arrival_month_4', 'arrival_month_5', 'arrival_month_6', 'arrival_month_7', 'arrival_month_8', 'arrival_month_9', 'arrival_month_10', 'arrival_month_11', 'arrival_month_12', 'room_type_reserved_Room_Type 2', 'room_type_reserved_Room_Type 4', 'room_type_reserved_Room_Type 5', 'room_type_reserved_Room_Type 6', 'room_type_reserved_Room_Type 7', 'market_segment_type_Corporate', 'market_segment_type_Offline']
# Restrict the training matrix to the features kept by backward elimination.
X_train2 = X_train[selected_features]
# fitting logistic regression model with only statistically significant variables (low p-values)
logit2 = sm.Logit(y_train, X_train2.astype(float))
lg2 = logit2.fit(disp=False)
print("Second training performance after removing high P-values:")
model_performance_classification_statsmodels(lg2, X_train2, y_train)
Second training performance after removing high P-values:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.807262 | 0.645462 | 0.736727 | 0.688082 |
# Score the original (all-features) logistic model on the held-out test set
# at the default 0.5 threshold; kept for the final model comparison table.
print("Test performance on first model with 0.5 threshold:")
logistic_reg_0_5_perf_test = model_performance_classification_statsmodels(
    lg, X_test, y_test
)
logistic_reg_0_5_perf_test
Test performance on first model with 0.5 threshold:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.809152 | 0.647076 | 0.732091 | 0.686963 |
# defining a function to plot the confusion_matrix of a classification model
def confusion_matrix_statsmodels(model, predictors, target, threshold=0.5):
    """
    Plot the confusion matrix with counts and percentages in each cell.

    model: fitted classifier whose ``predict`` returns class-1 probabilities
    predictors: independent variables
    target: dependent variable (0/1)
    threshold: probability cutoff for classifying an observation as class 1
    """
    y_pred = model.predict(predictors) > threshold
    cm = confusion_matrix(target, y_pred)
    # hoisted: the grand total was previously recomputed via
    # cm.flatten().sum() for every cell
    total = cm.sum()
    labels = np.asarray(
        [
            "{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)
            for item in cm.flatten()
        ]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
# Confusion matrix of the logistic model on the training set (0.5 threshold).
confusion_matrix_statsmodels(lg, X_train, y_train)
True Positives (TP)(1,1): we correctly predicted cancellation 21%
True Negatives (TN)(0,0): we correctly predicted NO cancellation 60%
False Positives (FP)(1,0): we incorrectly predicted cancellation (a "Type I error") 7%
False Negatives (FN)(0,1): we incorrectly predicted NO cancellation (a "Type II error") 12%
If the hotel predicts a booking will cancel (1) and they actually don't (0) (Type I error), the hotel might be understaffed and overbooked and customer service might suffer.
If the hotel predicts a booking will not cancel (0) and they actually do (1) (Type II error), the hotel loses revenue on the room and is subject to costly, last-minute changes.
# ROC curve of the logistic model on the training set, AUC shown in legend.
logit_roc_auc_train = roc_auc_score(y_train, lg.predict(X_train))
fpr, tpr, thresholds = roc_curve(y_train, lg.predict(X_train))
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label="Logistic Regression (area = %0.2f)" % logit_roc_auc_train)
plt.plot([0, 1], [0, 1], "r--")  # chance diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()
# Optimal threshold as per AUC-ROC curve: maximize Youden's J = TPR - FPR,
# i.e. the point on the ROC curve farthest above the chance diagonal.
fpr, tpr, thresholds = roc_curve(y_train, lg.predict(X_train))
youden_j = tpr - fpr
optimal_threshold_auc_roc = thresholds[np.argmax(youden_j)]
print(optimal_threshold_auc_roc)
confusion_matrix_statsmodels(lg, X_train, y_train, threshold=optimal_threshold_auc_roc)
0.380973728094207
On the training set, True Positives went up from 21% to 25%, and False Negatives went down from 12% to 8%.
See below that training Recall went up from 65% to 77%.
# checking model performance at the ROC-optimal threshold
log_reg_model_train_perf_threshold_auc_roc = model_performance_classification_statsmodels(
    lg, X_train, y_train, threshold=optimal_threshold_auc_roc
)
# Report the threshold actually used: the Youden-J optimum computed above is
# ~0.38, so the previous hard-coded ".34" in the message was wrong.
print(
    f"Training performance after changing threshold to "
    f"{optimal_threshold_auc_roc:.2f} and reducing Type II Errors:"
)
log_reg_model_train_perf_threshold_auc_roc
Training performance after changing threshold to .34 and reducing Type II Errors:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.796668 | 0.740285 | 0.674254 | 0.705728 |
# checking test-set performance at the ROC-optimal threshold
log_reg_model_test_perf_threshold_auc_roc = model_performance_classification_statsmodels(
    lg, X_test, y_test, threshold=optimal_threshold_auc_roc
)
# Report the threshold actually used: the Youden-J optimum is ~0.38, so the
# previous hard-coded ".34" in the message was wrong.
print(
    f"Test performance after changing threshold to "
    f"{optimal_threshold_auc_roc:.2f} and reducing Type II Errors:"
)
log_reg_model_test_perf_threshold_auc_roc
Test performance after changing threshold to .34 and reducing Type II Errors:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.79932 | 0.747871 | 0.670229 | 0.706924 |
# Precision/recall of the training-set predictions at every candidate threshold.
y_scores = lg.predict(X_train)
prec, rec, tre = precision_recall_curve(y_train, y_scores,)
def plot_prec_recall_vs_tresh(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold."""
    # precision_recall_curve returns one more precision/recall value than
    # thresholds, so the last element is dropped to align the arrays
    plt.plot(thresholds, precisions[:-1], "b--", label="precision")
    plt.plot(thresholds, recalls[:-1], "g--", label="recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
plt.figure(figsize=(10, 7))
plot_prec_recall_vs_tresh(prec, rec, tre)
plt.show()
# setting the threshold (read off the precision/recall-vs-threshold plot
# above, near where the two curves cross)
optimal_threshold_curve = 0.44
# checking model performance for this model
log_reg_model_train_perf_threshold_pr = model_performance_classification_statsmodels(
    lg, X_train, y_train, threshold=optimal_threshold_curve
)
print("Training performance after changing threshold to .44:")
log_reg_model_train_perf_threshold_pr
Training performance after changing threshold to .44:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.803403 | 0.696401 | 0.703637 | 0.7 |
# Same 0.44 threshold evaluated on the held-out test set.
log_reg_model_test_perf_threshold_pr = model_performance_classification_statsmodels(
    lg, X_test, y_test, threshold=optimal_threshold_curve
)
print("Test performance after changing threshold to .44:")
log_reg_model_test_perf_threshold_pr
Test performance after changing threshold to .44:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.804374 | 0.697615 | 0.697813 | 0.697714 |
# Baseline decision tree; class_weight up-weights the cancellation class (1)
# so missed cancellations are penalized more heavily than false alarms.
model = DecisionTreeClassifier(
    criterion="gini", class_weight={0: 0.15, 1: 0.85}, random_state=1
)
model.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, random_state=1)
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix with counts and percentages in each cell.

    model: fitted sklearn classifier (``predict`` returns class labels)
    predictors: independent variables
    target: dependent variable (0/1)
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    # hoisted: the grand total was previously recomputed via
    # cm.flatten().sum() for every cell
    total = cm.sum()
    labels = np.asarray(
        [
            "{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)
            for item in cm.flatten()
        ]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
## Function to calculate recall score
def get_recall_score(model, predictors, target):
    """
    Return the recall of ``model`` on the given data.

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    return recall_score(target, model.predict(predictors))
# Training-set confusion matrix and recall for the unpruned tree; the
# near-perfect recall printed below suggests the tree has overfit.
confusion_matrix_sklearn(model, X_train, y_train)
decision_tree_perf_train = get_recall_score(model, X_train, y_train)
print("Training Recall Score:", decision_tree_perf_train)
Training Recall Score: 0.9983259595838814
# Check model on TEST set; the sizable drop from training recall confirms
# the unpruned tree does not generalize well.
confusion_matrix_sklearn(model, X_test, y_test)
decision_tree_perf_test = get_recall_score(model, X_test, y_test)
print("Test Recall Score:", decision_tree_perf_test)
Test Recall Score: 0.7955706984667802
# Reusing the statsmodels metrics helper works here because the sklearn
# tree's predict() returns 0/1 labels, which the 0.5 threshold leaves intact.
decision_tree_original_perf_test = model_performance_classification_statsmodels(
    model, X_test, y_test
)
decision_tree_original_perf_test
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.861343 | 0.795571 | 0.780284 | 0.787853 |
## creating a list of column names
feature_names = X_train.columns.to_list()
# Horizontal bar chart of the tree's impurity-based feature importances,
# sorted ascending so the most important feature is drawn at the top.
importances = model.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Choose the type of classifier.
estimator = DecisionTreeClassifier(random_state=1, class_weight={0: 0.15, 1: 0.85})
# Grid of parameters to choose from
parameters = {
    "max_depth": [5, 7, 8, 10, 15, None],
    "criterion": ["entropy", "gini"],
    "splitter": ["best", "random"],
    "min_impurity_decrease": [0.00001, 0.0001, 0.01],
}
# Type of scoring used to compare parameter combinations; recall is chosen
# because missed cancellations (Type II errors) are the costly mistake here
scorer = make_scorer(recall_score)
# Run the grid search (cv is cross validation)
grid_obj = GridSearchCV(estimator, parameters, scoring=scorer, cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
estimator.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, max_depth=7,
min_impurity_decrease=1e-05, random_state=1,
splitter='random')
# Confusion matrix on training for the grid-searched (pre-pruned) tree
confusion_matrix_sklearn(estimator, X_train, y_train)
decision_tree_tune_perf_train = get_recall_score(estimator, X_train, y_train)
print("Training Recall Score:", decision_tree_tune_perf_train)
Training Recall Score: 0.9569532464426641
# Confusion matrix on testing for the grid-searched (pre-pruned) tree
confusion_matrix_sklearn(estimator, X_test, y_test)
decision_tree_tune_perf_test = get_recall_score(estimator, X_test, y_test)
print("TEST Recall Score:", decision_tree_tune_perf_test)
TEST Recall Score: 0.9517319704713231
# Full metric table for the pre-pruned tree on the test set; its high
# recall comes with much lower precision and accuracy (see output below).
decision_tree_prepruned_perf_test = model_performance_classification_statsmodels(
    estimator, X_test, y_test
)
decision_tree_prepruned_perf_test
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.610493 | 0.951732 | 0.451691 | 0.612629 |
# plot the hyperparameter-tuned decision tree
plt.figure(figsize=(15, 10))
out = tree.plot_tree(
    estimator,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
# plot_tree returns the annotation artists; darken the connecting arrows
# so the tree structure is easier to read
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor("black")
        arrow.set_linewidth(1)
plt.show()
# Feature importances of the tuned (pre-pruned) tree, sorted ascending so
# the most important feature is drawn at the top of the bar chart.
importances = estimator.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Cost-complexity (post-)pruning: compute the sequence of effective alphas
# and the corresponding total leaf impurities for an unconstrained tree.
clf = DecisionTreeClassifier(random_state=1)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
pd.DataFrame(path)
| ccp_alphas | impurities | |
|---|---|---|
| 0 | 0.000000e+00 | 0.008309 |
| 1 | 0.000000e+00 | 0.008309 |
| 2 | 1.842455e-07 | 0.008309 |
| 3 | 4.375831e-07 | 0.008309 |
| 4 | 4.688391e-07 | 0.008310 |
| ... | ... | ... |
| 1432 | 6.665684e-03 | 0.286897 |
| 1433 | 1.304480e-02 | 0.299942 |
| 1434 | 1.725993e-02 | 0.317202 |
| 1435 | 2.399048e-02 | 0.365183 |
| 1436 | 7.657789e-02 | 0.441761 |
1437 rows × 2 columns
# Visualize how total leaf impurity grows as alpha increases; the last
# (trivial, single-node) tree is excluded via the [:-1] slices.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
plt.show()
# Fit one tree per candidate alpha; .fit returns the estimator itself, so
# the whole sweep collapses to a comprehension.
clfs = [
    DecisionTreeClassifier(random_state=1, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]
print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]
    )
)
Number of nodes in the last tree is: 1 with ccp_alpha: 0.07657789477371346
# Drop the last tree/alpha pair: it is the trivial single-node tree.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
# Show how tree size and depth shrink as the pruning strength alpha grows.
fig, ax = plt.subplots(2, 1, figsize=(15, 8))
ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")
ax[0].set_title("Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
ax[1].set_title("Depth vs alpha")
fig.tight_layout()
# Recall and accuracy of every pruned tree on both partitions, computed
# with comprehensions instead of explicit accumulator loops.
recall_train = [recall_score(y_train, c.predict(X_train)) for c in clfs]
recall_test = [recall_score(y_test, c.predict(X_test)) for c in clfs]
train_scores = [c.score(X_train, y_train) for c in clfs]
test_scores = [c.score(X_test, y_test) for c in clfs]
# Recall vs alpha for both partitions; used to pick the pruning strength.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("alpha")
ax.set_ylabel("Recall")
ax.set_title("Recall vs alpha for training and testing sets")
ax.plot(
    ccp_alphas, recall_train, marker="o", label="train", drawstyle="steps-post",
)
ax.plot(ccp_alphas, recall_test, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()
# Accuracy vs alpha for both partitions.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker="o", label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()
# selecting the pruned tree with the highest TEST recall (the argmax is
# taken over recall_test only; training recall plays no part in the choice)
index_best_model = np.argmax(recall_test)
best_model = clfs[index_best_model]
print(best_model)
DecisionTreeClassifier(ccp_alpha=1.3127494223902537e-05, random_state=1)
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute accuracy, recall, precision, and F1 for an sklearn classifier.

    model: classifier
    predictors: independent variables
    target: dependent variable

    Returns a one-row DataFrame with columns Accuracy, Recall, Precision, F1.
    """
    # hard class predictions from the independent variables
    pred = model.predict(predictors)
    metrics = {
        "Accuracy": accuracy_score(target, pred),
        "Recall": recall_score(target, pred),
        "Precision": precision_score(target, pred),
        "F1": f1_score(target, pred),
    }
    return pd.DataFrame(metrics, index=[0])
# Post-pruned tree's metrics on the training partition.
decision_tree_postpruned_perf_train = model_performance_classification_sklearn(
    best_model, X_train, y_train
)
print("Post-pruned training scores:")
decision_tree_postpruned_perf_train
Post-pruned training scores:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.993541 | 0.98601 | 0.994333 | 0.990154 |
# Post-pruned tree's metrics on the held-out test partition.
decision_tree_postpruned_perf_test = model_performance_classification_sklearn(
    best_model, X_test, y_test
)
print("Post-pruned test scores:")
decision_tree_postpruned_perf_test
Post-pruned test scores:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.866673 | 0.802669 | 0.789004 | 0.795778 |
# Confusion matrix of the post-pruned tree on the training set.
print("Post-pruned training set matrix")
confusion_matrix_sklearn(best_model, X_train, y_train)
Post-pruned training set matrix
# Confusion matrix of the post-pruned tree on the test set.
print("Post-pruned test matrix")
confusion_matrix_sklearn(best_model, X_test, y_test)
Post-pruned test matrix
# testing performance comparison: stack each model's one-row test-set
# metrics into a single table and label the rows.
test_frames = [
    logistic_reg_0_5_perf_test,
    log_reg_model_test_perf_threshold_auc_roc,
    log_reg_model_test_perf_threshold_pr,
    decision_tree_original_perf_test,
    decision_tree_prepruned_perf_test,
    decision_tree_postpruned_perf_test,
]
row_labels = [
    "Logistic 0.5",
    "Logistic ROC 0.34",
    "Logistic 0.44",
    "Decision Tree 0",
    "Tree Pre-pruned",
    "Tree Post-pruned",
]
model_test_perf = pd.concat(test_frames)
model_test_perf.index = row_labels
print("Testing Performance of all models:")
model_test_perf
Testing Performance of all models:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| Logistic 0.5 | 0.809152 | 0.647076 | 0.732091 | 0.686963 |
| Logistic ROC 0.34 | 0.799320 | 0.747871 | 0.670229 | 0.706924 |
| Logistic 0.44 | 0.804374 | 0.697615 | 0.697813 | 0.697714 |
| Decision Tree 0 | 0.861343 | 0.795571 | 0.780284 | 0.787853 |
| Tree Pre-pruned | 0.610493 | 0.951732 | 0.451691 | 0.612629 |
| Tree Post-pruned | 0.866673 | 0.802669 | 0.789004 | 0.795778 |
# Feature importances of the final post-pruned tree.
importances = best_model.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
The most important features of the final model are lead_time, avg_price_per_room, market_segment_type_Online, and no_of_special_requests.